In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
plt.style.use('dark_background')
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, plot_roc_curve
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from tqdm import tqdm_notebook
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')
from luciferml.preprocessing import Preprocess as prep
In [2]:
data = pd.read_csv('water_potability.csv')
In [3]:
data.head()
Out[3]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability
0 NaN 204.890455 20791.318981 7.300212 368.516441 564.308654 10.379783 86.990970 2.963135 0
1 3.716080 129.422921 18630.057858 6.635246 NaN 592.885359 15.180013 56.329076 4.500656 0
2 8.099124 224.236259 19909.541732 9.275884 NaN 418.606213 16.868637 66.420093 3.055934 0
3 8.316766 214.373394 22018.417441 8.059332 356.886136 363.266516 18.436524 100.341674 4.628771 0
4 9.092223 181.101509 17978.986339 6.546600 310.135738 398.410813 11.558279 31.997993 4.075075 0
In [4]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               2785 non-null   float64
 1   Hardness         3276 non-null   float64
 2   Solids           3276 non-null   float64
 3   Chloramines      3276 non-null   float64
 4   Sulfate          2495 non-null   float64
 5   Conductivity     3276 non-null   float64
 6   Organic_carbon   3276 non-null   float64
 7   Trihalomethanes  3114 non-null   float64
 8   Turbidity        3276 non-null   float64
 9   Potability       3276 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 256.1 KB
In [5]:
data.describe()
Out[5]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability
count 2785.000000 3276.000000 3276.000000 3276.000000 2495.000000 3276.000000 3276.000000 3114.000000 3276.000000 3276.000000
mean 7.080795 196.369496 22014.092526 7.122277 333.775777 426.205111 14.284970 66.396293 3.966786 0.390110
std 1.594320 32.879761 8768.570828 1.583085 41.416840 80.824064 3.308162 16.175008 0.780382 0.487849
min 0.000000 47.432000 320.942611 0.352000 129.000000 181.483754 2.200000 0.738000 1.450000 0.000000
25% 6.093092 176.850538 15666.690297 6.127421 307.699498 365.734414 12.065801 55.844536 3.439711 0.000000
50% 7.036752 196.967627 20927.833607 7.130299 333.073546 421.884968 14.218338 66.622485 3.955028 0.000000
75% 8.062066 216.667456 27332.762127 8.114887 359.950170 481.792304 16.557652 77.337473 4.500320 1.000000
max 14.000000 323.124000 61227.196008 13.127000 481.030642 753.342620 28.300000 124.000000 6.739000 1.000000
In [6]:
data.shape
Out[6]:
(3276, 10)
In [7]:
data.isnull().sum()
Out[7]:
ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64
In [8]:
data = data.dropna()
In [9]:
data.isnull().sum()
Out[9]:
ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64
In [10]:
plt.figure(figsize=(10,6)) #setting the figure size
sns.countplot(data['Potability'], palette='rocket') # checking the class count of potable water
plt.title('Potability count', weight='bold')
plt.tight_layout()
#total potability count plotted
In [11]:
non_potable = data[data['Potability']==0]
percent_non_potable = len(non_potable)/ len(data)
print('The percentage of non potable water is: {}%'.format(round(percent_non_potable * 100,4)))
The percentage of non potable water is: 59.6718%
In [12]:
data.nunique()
Out[12]:
ph                 2011
Hardness           2011
Solids             2011
Chloramines        2011
Sulfate            2011
Conductivity       2011
Organic_carbon     2011
Trihalomethanes    2011
Turbidity          2011
Potability            2
dtype: int64
In [13]:
colors = sns.color_palette('twilight')[0:6]
sns.palplot(colors)
#set up color palette for boxen plot
In [14]:
#Boxen Plot of each Column except Solids
df1 = pd.DataFrame()
df1 = data
df1 = df1.drop("Solids",1)
fig1, ax = plt.subplots(figsize=[20,10])
ax = sns.boxenplot(data=df1, orient="h", palette=colors)
sns.despine(offset=10, trim=True)
plt.title("Boxen Plot of each Column except Solids", fontsize=20);
plt.show()
In [15]:
columns = data[['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
       'Organic_carbon', 'Trihalomethanes', 'Turbidity']]
In [16]:
columns.shape
Out[16]:
(2011, 9)
In [17]:
def distributions(data):
    
    
    features = ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
       'Organic_carbon', 'Trihalomethanes', 'Turbidity']
    
    plt.figure(figsize=(16,16))
    for i in tqdm_notebook(range(len(data.columns)), desc = 'loading'):
        plt.subplot(3,3,i+1)
        sns.distplot(data[data.columns[i]], color='red', rug=True)
        plt.title(data.columns[i], weight='bold')
        plt.tight_layout()
        
distributions(columns)

In [18]:
def pairplt(data):
    
    sns.pairplot(data, hue='Potability', palette='OrRd')
    plt.tight_layout()
    
pairplt(data)
In [19]:
data.hist(bins=20, color = 'green', figsize=(16,16))
plt.tight_layout()
In [20]:
data.columns
Out[20]:
Index(['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
       'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability'],
      dtype='object')
In [21]:
def attributes_and_potability(data):
 
    #getting count of everything with respect to potability
    features = ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
       'Organic_carbon', 'Trihalomethanes', 'Turbidity']
    for var in tqdm_notebook(features, desc = 'loading'):
        plt.figure(figsize=(16,10))
        sns.histplot(data = data, x = data[var], hue ='Potability') # histogram
        plt.title(var) # title of the plot
        
attributes_and_potability(data)

In [22]:
plt.figure(figsize=(16,12))
matrix = np.triu(data.corr()) # matrix to return k -th diagonal zeroed values.
sns.heatmap(data.corr(), annot=True, mask=matrix, cmap='OrRd') # creating correlational map
plt.title('Correlational Map', weight='bold');
In [23]:
def correct_skewness(data):
    
    #correct data skew
    data_set = prep.skewcorrect(data, except_columns=['Potability'])
    return data_set

data_set = correct_skewness(data) # function calling
          
██╗░░░░░██╗░░░██╗░█████╗░██╗███████╗███████╗██████╗░░░░░░░███╗░░░███╗██╗░░░░░
██║░░░░░██║░░░██║██╔══██╗██║██╔════╝██╔════╝██╔══██╗░░░░░░████╗░████║██║░░░░░
██║░░░░░██║░░░██║██║░░╚═╝██║█████╗░░█████╗░░██████╔╝█████╗██╔████╔██║██║░░░░░
██║░░░░░██║░░░██║██║░░██╗██║██╔══╝░░██╔══╝░░██╔══██╗╚════╝██║╚██╔╝██║██║░░░░░
███████╗╚██████╔╝╚█████╔╝██║██║░░░░░███████╗██║░░██║░░░░░░██║░╚═╝░██║███████╗
╚══════╝░╚═════╝░░╚════╝░╚═╝╚═╝░░░░░╚══════╝╚═╝░░╚═╝░░░░░░╚═╝░░░░░╚═╝╚══════╝

Started Preprocessor 


Skewness in numerical features: 

                 Skewness
Solids           0.595449
Conductivity     0.266670
ph               0.048910
Chloramines      0.012967
Organic_carbon  -0.020003
Turbidity       -0.033027
Sulfate         -0.046523
Trihalomethanes -0.051384
Hardness        -0.085174
Skewness Before Transformation for Solids:  0.5958940107371633 

Mean before Transformation for Solids : 21917.441374490336, Standard Deviation before Transformation for Solids : 8640.090806098791 

Skewness After Transformation for Solids:  -1.2308145151482406 

Mean before Transformation for Solids : 9.908982178445935, Standard Deviation before Transformation for Solids : 0.4428466687614959 

Skewness Before Transformation for Conductivity:  0.26686882669457 

Mean before Transformation for Conductivity : 426.5264087317783, Standard Deviation before Transformation for Conductivity : 80.69250214345881 

Skewness After Transformation for Conductivity:  -0.19929597288588363 

Mean before Transformation for Conductivity : 6.03995949808105, Standard Deviation before Transformation for Conductivity : 0.19133953254702343 

Skewness Before Transformation for ph:  0.04894678355193397 

Mean before Transformation for Ph : 7.085989839285033, Standard Deviation before Transformation for Ph : 1.572945479321659 

Skewness After Transformation for ph:  -1.1353215889866628 

Mean before Transformation for Ph : 2.069646135724062, Standard Deviation before Transformation for Ph : 0.2091935570875167 

Skewness Before Transformation for Chloramines:  0.012976277458973314 

Mean before Transformation for Chloramines : 7.134338414511035, Standard Deviation before Transformation for Chloramines : 1.5844257944238938 

Skewness After Transformation for Chloramines:  -0.8633940049210874 

Mean before Transformation for Chloramines : 2.075660803668557, Standard Deviation before Transformation for Chloramines : 0.2077454472194916 

Skewness Before Transformation for Organic_carbon:  -0.020017660786145686 

Mean before Transformation for Organic_carbon : 14.357709409067539, Standard Deviation before Transformation for Organic_carbon : 3.3241318633200096 

Skewness After Transformation for Organic_carbon:  -0.8735022417526354 

Mean before Transformation for Organic_carbon : 2.7060706485880073, Standard Deviation before Transformation for Organic_carbon : 0.23318171477866811 

Skewness Before Transformation for Turbidity:  -0.03305148365834328 

Mean before Transformation for Turbidity : 3.9697287992523864, Standard Deviation before Transformation for Turbidity : 0.7801521151552843 

Skewness After Transformation for Turbidity:  -0.5344601027379543 

Mean before Transformation for Turbidity : 1.5905102577808692, Standard Deviation before Transformation for Turbidity : 0.16262524294524067 

Skewness Before Transformation for Sulfate:  -0.046557696988421486 

Mean before Transformation for Sulfate : 333.22467188905864, Standard Deviation before Transformation for Sulfate : 41.194925817413676 

Skewness After Transformation for Sulfate:  -0.6904323262620179 

Mean before Transformation for Sulfate : 5.803934429028258, Standard Deviation before Transformation for Sulfate : 0.1272025102134412 

Skewness Before Transformation for Trihalomethanes:  -0.051422085759112356 

Mean before Transformation for Trihalomethanes : 66.4008593672628, Standard Deviation before Transformation for Trihalomethanes : 16.0731116931157 

Skewness After Transformation for Trihalomethanes:  -1.1717469055146503 

Mean before Transformation for Trihalomethanes : 4.178489111806941, Standard Deviation before Transformation for Trihalomethanes : 0.265483190023941 

Skewness Before Transformation for Hardness:  -0.08523742258053371 

Mean before Transformation for Hardness : 195.9680715571974, Standard Deviation before Transformation for Hardness : 32.62696937081507 

Skewness After Transformation for Hardness:  -0.8204384680215894 

Mean before Transformation for Hardness : 5.268413889542584, Standard Deviation before Transformation for Hardness : 0.17487336609382453 

Elapsed Time:  12.224323987960815 seconds

In [24]:
data_set.head()
Out[24]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability
3 2.231816 5.372373 9.999680 2.203795 5.880215 5.897886 2.967154 4.618498 1.727891 0
4 2.311765 5.204564 9.797015 2.021097 5.740229 5.989990 2.530380 3.496447 1.624341 0
5 1.884656 5.243403 10.266382 2.145331 5.792033 5.640018 2.240681 4.023884 1.269679 0
6 2.418042 5.517741 10.266418 2.141642 5.978033 5.651266 2.693931 4.449727 1.301006 0
7 2.265490 5.319891 9.523185 1.716139 5.718046 6.164593 2.592551 4.155727 1.686663 0
In [25]:
X = data_set.iloc[:,:-1].values
y = data_set.iloc[:,-1].values

X.shape, y.shape
Out[25]:
((2011, 9), (2011,))
In [26]:
scaler = StandardScaler()
X = scaler.fit_transform(X)
X
Out[26]:
array([[ 0.77521233,  0.5944836 ,  0.20480627, ...,  1.1196562 ,
         1.65738784,  0.84476912],
       [ 1.15739242, -0.36511915, -0.25283611, ..., -0.7534488 ,
        -2.56906051,  0.20803123],
       [-0.88430311, -0.14302035,  0.80705136, ..., -1.99582197,
        -0.5823542 , -1.97282826],
       ...,
       [ 2.17675501, -4.03732299,  1.38830971, ...,  0.58861173,
        -1.61068388,  0.55453041],
       [-0.54418505, -0.19320948,  0.59214942, ..., -0.58294994,
        -0.22884644, -0.30384534],
       [-1.60039285,  0.01688199,  1.94475317, ..., -0.02180318,
         0.13716609,  0.63028525]])
In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0) # data splitting 80-20% to avoid over fitting
In [28]:
X_train, y_train = shuffle(X_train, y_train) # data shuffling
In [29]:
y_train.shape, y_test.shape
Out[29]:
((1608,), (403,))
In [30]:
pipeline = make_pipeline(RobustScaler()) # creating a pipeline for all the models
Random_forest = make_pipeline(pipeline, RandomForestClassifier(random_state=0, min_samples_leaf = 2, n_estimators = 1000))
Decision_tree = make_pipeline(pipeline, DecisionTreeClassifier(random_state=0))
Logistic_regression = make_pipeline(pipeline, LogisticRegression(random_state=0))
svc = make_pipeline(pipeline, SVC(random_state=0))
KNeighbors = make_pipeline(pipeline, KNeighborsClassifier())
Ada_boost = make_pipeline(pipeline, AdaBoostClassifier(random_state=0))
xgboost = make_pipeline(pipeline, XGBClassifier())
gradientboost = make_pipeline(pipeline, GradientBoostingClassifier(random_state=0))
In [31]:
param_dist = {
    'RandomForest':Random_forest,
    'DecisionTree':Decision_tree,
    'LogisticRegression':Logistic_regression,
    'svc':svc,
    'KNeighbors':KNeighbors,
    'AdaBoost':Ada_boost,
    'XGB':xgboost,
    'GD':gradientboost
}
In [32]:
def MODEL(model):
    
    #run model and generate confusion matrix 
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print('The accuracy score of the model is: {}%'.format(accuracy_score(y_test, y_pred)))
    print('-'*50)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
In [33]:
def model_evaluation(parameter_dictionary):
    
  #  calculate precision, recall, f1 score
    
    for name, model in parameter_dictionary.items():
        print('-'*50)
        print(name)
        evaluation = MODEL(model)
    return evaluation
evaluation = model_evaluation(param_dist)
--------------------------------------------------
RandomForest
The accuracy score of the model is: 0.7270471464019851%
--------------------------------------------------
[[227  25]
 [ 85  66]]
              precision    recall  f1-score   support

           0       0.73      0.90      0.80       252
           1       0.73      0.44      0.55       151

    accuracy                           0.73       403
   macro avg       0.73      0.67      0.68       403
weighted avg       0.73      0.73      0.71       403

--------------------------------------------------
DecisionTree
The accuracy score of the model is: 0.6153846153846154%
--------------------------------------------------
[[175  77]
 [ 78  73]]
              precision    recall  f1-score   support

           0       0.69      0.69      0.69       252
           1       0.49      0.48      0.49       151

    accuracy                           0.62       403
   macro avg       0.59      0.59      0.59       403
weighted avg       0.61      0.62      0.62       403

--------------------------------------------------
LogisticRegression
The accuracy score of the model is: 0.6451612903225806%
--------------------------------------------------
[[251   1]
 [142   9]]
              precision    recall  f1-score   support

           0       0.64      1.00      0.78       252
           1       0.90      0.06      0.11       151

    accuracy                           0.65       403
   macro avg       0.77      0.53      0.45       403
weighted avg       0.74      0.65      0.53       403

--------------------------------------------------
svc
The accuracy score of the model is: 0.7369727047146402%
--------------------------------------------------
[[234  18]
 [ 88  63]]
              precision    recall  f1-score   support

           0       0.73      0.93      0.82       252
           1       0.78      0.42      0.54       151

    accuracy                           0.74       403
   macro avg       0.75      0.67      0.68       403
weighted avg       0.75      0.74      0.71       403

--------------------------------------------------
KNeighbors
The accuracy score of the model is: 0.6501240694789082%
--------------------------------------------------
[[196  56]
 [ 85  66]]
              precision    recall  f1-score   support

           0       0.70      0.78      0.74       252
           1       0.54      0.44      0.48       151

    accuracy                           0.65       403
   macro avg       0.62      0.61      0.61       403
weighted avg       0.64      0.65      0.64       403

--------------------------------------------------
AdaBoost
The accuracy score of the model is: 0.6054590570719603%
--------------------------------------------------
[[202  50]
 [109  42]]
              precision    recall  f1-score   support

           0       0.65      0.80      0.72       252
           1       0.46      0.28      0.35       151

    accuracy                           0.61       403
   macro avg       0.55      0.54      0.53       403
weighted avg       0.58      0.61      0.58       403

--------------------------------------------------
XGB
The accuracy score of the model is: 0.6724565756823822%
--------------------------------------------------
[[222  30]
 [102  49]]
              precision    recall  f1-score   support

           0       0.69      0.88      0.77       252
           1       0.62      0.32      0.43       151

    accuracy                           0.67       403
   macro avg       0.65      0.60      0.60       403
weighted avg       0.66      0.67      0.64       403

--------------------------------------------------
GD
The accuracy score of the model is: 0.6823821339950372%
--------------------------------------------------
[[219  33]
 [ 95  56]]
              precision    recall  f1-score   support

           0       0.70      0.87      0.77       252
           1       0.63      0.37      0.47       151

    accuracy                           0.68       403
   macro avg       0.66      0.62      0.62       403
weighted avg       0.67      0.68      0.66       403

In [34]:
accuracy_score_model = {
    'RandomForest':71.9,
    'DecisionTree':61.6,
    'LogisticRegression':63,
    'svc':73.6,
    'KNeighbors':65,
    'AdaBoost':60,
    'XGB':66,
    'GD':68
    
}
In [35]:
def models_overview(accuracy_score_model):
    
   
    #compare models to each other
    
    model_accuracy = list(accuracy_score_model.values())
    model_name = list(accuracy_score_model.keys())

    g = sns.barplot(x = model_accuracy, y = model_name,palette='OrRd')
    plt.title('Models Overview', weight='bold');
    return g
    
over_view = models_overview(accuracy_score_model)
In [36]:
svc = SVC(random_state=0)
svc.fit(X_train, y_train)
Out[36]:
SVC(random_state=0)
In [37]:
y_pred = svc.predict(X_test)
In [38]:
def svc_report(y_test, y_pred, X_test, svc):
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True)
    plot_roc_curve(svc, X_test, y_test)
    print(classification_report(y_test, y_pred))
    
svc_report(y_test, y_pred, X_test, svc)
              precision    recall  f1-score   support

           0       0.73      0.93      0.82       252
           1       0.78      0.41      0.54       151

    accuracy                           0.74       403
   macro avg       0.76      0.67      0.68       403
weighted avg       0.75      0.74      0.71       403

In [39]:
print('The accuracy score of the model is: {}% '.format(round(accuracy_score(y_test, y_pred)*100, 2)))
The accuracy score of the model is: 73.7% 
In [40]:
#SVC is best model
In [ ]: